Solution without Pandas


In [ ]:
# Download the dataset into nyt1.csv in the current working directory.
# -f : exit with an error on HTTP failures instead of saving the server's
#      error page as if it were the CSV
# -sS: hide the progress meter but still show error messages
# -L : follow redirects
# -o : write to the named file (replaces the shell redirect)
!curl -fsSL -o nyt1.csv https://raw.githubusercontent.com/TeachingDataScience/datasets/master/nyt1.csv

In [ ]:
# %load nytimes_counter.py

# Import required libraries
import csv

def count_genders_and_ages(rows):
    """Tally how many rows carry each gender code and each age.

    Parameters
    ----------
    rows : iterable of sequences of str
        Data rows with the header already removed. The age is taken from
        column 0 and the gender from column 1, i.e. the first two fields of
        the (age, gender, impressions, clicks, signed_in) layout this script
        reads. Empty rows (e.g. a trailing blank line) are skipped.

    Returns
    -------
    (dict, dict)
        ``(gender_counter, age_counter)``; each maps the raw string value
        found in the file to the number of rows that contain it.
    """
    gender_counter = {}
    age_counter = {}
    for row in rows:
        if not row:
            # csv.reader yields [] for blank lines; nothing to count there.
            continue
        age, gender = row[0], row[1]
        gender_counter[gender] = gender_counter.get(gender, 0) + 1
        age_counter[age] = age_counter.get(age, 0) + 1
    return gender_counter, age_counter


# __name__ is '__main__' both when this runs as a notebook cell and when it is
# executed with `python nytimes_counter.py`, so the report below is still
# produced in both cases, while importing the file only defines the function.
if __name__ == '__main__':
    # The context manager closes the file even if counting raises.
    with open('nyt1.csv') as csv_file:
        lines = csv.reader(csv_file)
        # Skip the header row; the default avoids StopIteration on an empty file.
        next(lines, None)
        gender_counter, age_counter = count_genders_and_ages(lines)

    # %-formatting prints the same text under Python 2 and Python 3.
    # .get(..., 0) reports zero instead of raising when a code never occurs.
    print("Gender 0:  %d" % gender_counter.get('0', 0))
    print("Gender 1:  %d" % gender_counter.get('1', 0))
    print("Ages: ")
    print(age_counter)

In [4]:
# Run the script as a standalone program via the shell.
# NOTE(review): presumably nytimes_counter.py holds the same code shown in the
# %load cell - confirm. That code opens 'nyt1.csv' by relative path, so the
# CSV must already exist in the current working directory.
!python nytimes_counter.py


Gender 0:  290176
Gender 1:  168265
Ages: 
{'24': 3158, '25': 3260, '26': 6405, '27': 6551, '20': 6337, '21': 6384, '22': 6394, '23': 6488, '28': 6359, '29': 6379, '0': 137106, '8': 15, '59': 4900, '58': 4916, '55': 2502, '54': 3515, '57': 5014, '56': 5034, '51': 7147, '50': 6982, '53': 7231, '52': 7165, '88': 155, '89': 128, '82': 548, '83': 483, '80': 751, '81': 618, '86': 228, '87': 211, '84': 369, '85': 298, '7': 5, '108': 1, '102': 2, '103': 2, '100': 4, '101': 4, '107': 2, '104': 1, '39': 7804, '38': 7891, '33': 6431, '32': 6443, '31': 6439, '30': 6617, '37': 7983, '36': 7933, '35': 3932, '34': 3290, '60': 4880, '61': 5012, '62': 5006, '63': 4950, '64': 2524, '65': 1067, '66': 2355, '67': 2250, '68': 2299, '69': 2015, '99': 9, '98': 15, '91': 72, '90': 106, '93': 42, '92': 66, '95': 35, '94': 49, '97': 14, '96': 27, '11': 283, '10': 113, '13': 1059, '12': 586, '15': 2605, '14': 1718, '17': 3953, '16': 3443, '19': 6509, '18': 5424, '48': 7277, '49': 7054, '46': 7202, '47': 7070, '44': 4006, '45': 3645, '42': 7919, '43': 7784, '40': 7702, '41': 7906, '9': 48, '77': 1161, '76': 1257, '75': 1473, '74': 1481, '73': 1681, '72': 1820, '71': 1860, '70': 2033, '79': 825, '78': 936}

Note: The `%load` magic replaces the cell's contents with the source of the named file, so the script can be inspected, edited, and run directly in the notebook.

Solution using Pandas


In [5]:
import pandas as pd

fileurl = "https://raw.githubusercontent.com/TeachingDataScience/datasets/master/nyt1.csv"


def summarize_demographics(frame):
    """Count rows per gender code (0 and 1) and per age.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``Gender`` and ``Age`` columns.

    Returns
    -------
    (dict, pandas.Series)
        ``gender_counts`` maps each of the codes 0 and 1 to the number of
        rows with that ``Gender`` value; ``age_counts`` is the per-age row
        count, indexed by age in ascending order.
    """
    # Summing the boolean mask counts matches without building a filtered
    # copy of the frame just to take its length.
    gender_counts = dict(
        (code, int((frame.Gender == code).sum())) for code in (0, 1)
    )
    age_counts = frame.groupby('Age').Age.count()
    return gender_counts, age_counts


# In a notebook kernel __name__ is '__main__', so this still runs when the
# cell is executed; the guard only keeps an import of this code from
# triggering the download.
if __name__ == '__main__':
    nyt = pd.read_csv(fileurl)
    gender_counts, age_counts = summarize_demographics(nyt)

    # %-formatting prints the same text under Python 2 and Python 3.
    print("Gender 0:  %d" % gender_counts[0])
    print("Gender 1:  %d" % gender_counts[1])
    # By default pandas elides the middle of a long Series with "...";
    # lift the row limit for this print so every age is shown.
    with pd.option_context('display.max_rows', None):
        print(age_counts)


Gender 0:  290176
Gender 1:  168265
Age
0      137106
7           5
8          15
9          48
10        113
11        283
12        586
13       1059
14       1718
15       2605
16       3443
17       3953
18       5424
19       6509
20       6337
21       6384
22       6394
23       6488
24       3158
25       3260
26       6405
27       6551
28       6359
29       6379
30       6617
31       6439
32       6443
33       6431
34       3290
35       3932
        ...  
77       1161
78        936
79        825
80        751
81        618
82        548
83        483
84        369
85        298
86        228
87        211
88        155
89        128
90        106
91         72
92         66
93         42
94         49
95         35
96         27
97         14
98         15
99          9
100         4
101         4
102         2
103         2
104         1
107         2
108         1
Name: Age, dtype: int64

Note: How would we make this re-usable?